In [ ]:
""" This simple code is desinged to teach a basic user to read in the files in python, simply find what proportion of males and females survived and make a predictive model based on this
Author : AstroDave
Date : 18 September 2012
Revised: 28 March 2014
"""
In [1]:
import csv as csv
import numpy as np
In [2]:
# Load the training CSV. Every row is read while the file is open, so the
# handle is closed as soon as the `with` block ends instead of staying open
# for the rest of the session.
with open('./data/train.csv', 'rb') as train_file:  # binary mode, as the Python 2 csv module expects
    train_rows = list(csv.reader(train_file))

# Keep `csv_file_object` as an iterator over the rows so later cells can
# still loop over it exactly as they would over a csv.reader.
csv_file_object = iter(train_rows)
header = next(csv_file_object)  # Skip the first line as it is a header
data = []  # Create a variable to hold the data rows
In [4]:
# Drain every remaining row from the reader into the list in one call,
# then turn the list of rows into a NumPy array.
data.extend(csv_file_object)  # each item is one CSV row (a list of strings)
data = np.array(data)  # from here on `data` is an array, not a list
In [7]:
# Check what kind of object `data` is after the list-to-array conversion.
type(data)
Out[7]:
In [8]:
# Show the array dimensions as (rows, columns).
data.shape
Out[8]:
In [9]:
# Echo the array to inspect its contents.
data
Out[9]:
I have an array of 12 columns and 891 rows.
I can access any element I want, so the entire first column would be data[0::,0].astype(np.float)
-- This means all of the rows (from start to end) in column 0. I have to add the .astype() command because, when the rows were appended, Python stored every value as a string, so it needs to be converted.
In [11]:
# Every row of column 0, converted from the strings read out of the CSV to floats.
data[0::,0].astype(np.float)
Out[11]:
In [31]:
# Overall survival rate across the whole training set.
# Column 1 is converted once and reused; going by the variable names here it
# holds the survival flag, so its sum is the number of survivors.
survived_flags = data[0::, 1].astype(np.float)
number_passengers = np.size(survived_flags)
number_survived = np.sum(survived_flags)
proportion_survivors = number_survived / number_passengers

# The arguments follow the order of the message: survivors / passengers = proportion.
# (They were previously passed as passengers / survivors, which did not match
# the proportion being printed.)
# A single parenthesised argument prints the same text whether `print` is a
# statement or a function.
print('Proportion of Survivors: %s/%s = %s' % (number_survived,
                                               number_passengers,
                                               proportion_survivors))
In [20]:
# Boolean masks over the rows, built from column 4.
gender_column = data[:, 4]
women_only_stats = gender_column == "female"  # True on rows whose column 4 reads "female"
men_only_stats = ~women_only_stats  # every other row, i.e. column 4 is not "female"
Filter the whole data set to find statistics for just the women by using women_only_stats as a "mask" on the full data -- use it in place of the '0::' part of the array index.
You can test it by placing it there, and requesting column index [4], and the output should all read 'female' e.g. try typing this: data[women_only_stats,4]
In [35]:
# Convert column 1 to floats once, then apply each mask to it.
# The results are the column-1 values for the women and for the men.
column_one_values = data[:, 1].astype(np.float)
women_onboard = column_one_values[women_only_stats]
men_onboard = column_one_values[men_only_stats]
In [36]:
# Head counts: the size of each masked array is the number of rows it selected.
women_count = women_onboard.size
men_count = men_onboard.size
print('Women onboard: %s' % women_count)
print('Men onboard: %s' % men_count)
print('Total onboard: %s' % (women_count + men_count))
In [37]:
# Survival rate per group: sum of the group's values divided by the group size.
proportion_women_survived = women_onboard.sum() / women_onboard.size
proportion_men_survived = men_onboard.sum() / men_onboard.size
In [28]:
# Report the two survival rates computed above.
print('Proportion of women who survived is %s' % (proportion_women_survived,))
print('Proportion of men who survived is %s' % (proportion_men_survived,))
Now that I have my indicator that women were much more likely to survive, I am done with the training set.
In [54]:
# Open the test set and wrap it in a CSV reader. The handle is kept in
# `test_file` so it can be closed explicitly once the rows have been read.
test_file = open('./data/test.csv', 'rb')
test_file_object = csv.reader(test_file)
header = next(test_file_object)  # the first row is the column header, not data
In [55]:
# Show the column names read from the first row of test.csv.
header
Out[55]:
In [57]:
# Create the output CSV for the predictions and write its header row first.
predictions_file = open("./models/jfaPythonBasicGenderModel.csv", "wb")
predictions_file_object = csv.writer(predictions_file)
output_columns = ["PassengerId", "Survived"]
predictions_file_object.writerow(output_columns)
In [58]:
# Gender-only model: predict "1" for every female passenger and "0" otherwise.
# NOTE(review): the gender check uses index 3 here, whereas the training data
# used index 4 -- presumably test.csv has one column fewer; confirm against
# the header.
for row in test_file_object:
    prediction = "1" if row[3] == 'female' else "0"
    predictions_file_object.writerow([row[0], prediction])  # column 0 goes out under "PassengerId"
In [59]:
# Release both file handles now that every prediction has been written.
for open_handle in (test_file, predictions_file):
    open_handle.close()
In [ ]:
In [ ]: